library(tidyverse)
library(nycflights13)
library(ggplot2)
library(readr)
library(here)
  1. Load the nycflights13::flights data set. Does it come formatted as a tibble?
data("flights")
class(flights)
## [1] "tbl_df"     "tbl"        "data.frame"
  1. Find all flights that
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      623            627        -4      933            932
##  4  2013     1     1      728            732        -4     1041           1038
##  5  2013     1     1      739            739         0     1104           1038
##  6  2013     1     1      908            908         0     1228           1219
##  7  2013     1     1     1028           1026         2     1350           1339
##  8  2013     1     1     1044           1045        -1     1352           1351
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1205           1200         5     1503           1505
## # ... with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dep_delay >= 60 & arr_delay <= dep_delay - 30)
## # A tibble: 2,074 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1716           1545        91     2140           2039
##  2  2013     1     1     2205           1720       285       46           2040
##  3  2013     1     1     2326           2130       116      131             18
##  4  2013     1     3     1503           1221       162     1803           1555
##  5  2013     1     3     1821           1530       171     2131           1910
##  6  2013     1     3     1839           1700        99     2056           1950
##  7  2013     1     3     1850           1745        65     2148           2120
##  8  2013     1     3     1923           1815        68     2036           1958
##  9  2013     1     3     1941           1759       102     2246           2139
## 10  2013     1     3     1950           1845        65     2228           2227
## # ... with 2,064 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights , between(dep_time, 0, 600) | dep_time == 2400)
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Which flights took the longest and which the shortest?
slice(flights, which.max(air_time))
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     3    17     1337           1335         2     1937           1836
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
slice_max(flights, air_time)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     3    17     1337           1335         2     1937           1836
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
slice(flights, which.min(air_time))
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1    16     1355           1315        40     1442           1411
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
dplyr::slice_min(flights, air_time)
## # A tibble: 2 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1    16     1355           1315        40     1442           1411
## 2  2013     4    13      537            527        10      622            628
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
  1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay.
selection <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
flights[ ,selection]
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows
flights %>% select(selection)
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows
  1. Create a new variable first_flight which takes the value TRUE if it is the first flight of the day.
flights <-  mutate(flights, first_flight = if_else(day - lag(day) != 0 | is.na(day - lag(day)), TRUE, FALSE )) 
filter(flights, first_flight == TRUE)
## # A tibble: 365 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     2       42           2359        43      518            442
##  3  2013     1     3       32           2359        33      504            442
##  4  2013     1     4       25           2359        26      505            442
##  5  2013     1     5       14           2359        15      503            445
##  6  2013     1     6       16           2359        17      451            442
##  7  2013     1     7       49           2359        50      531            444
##  8  2013     1     8      454            500        -6      625            648
##  9  2013     1     9        2           2359         3      432            444
## 10  2013     1    10        3           2359         4      426            437
## # ... with 355 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   first_flight <lgl>
flights %>%
  group_by(year, month, day) %>%
  mutate(first_flight = if_else(row_number() == 1, TRUE, FALSE)) %>%
  select(first_flight)
## # A tibble: 336,776 x 4
## # Groups:   year, month, day [365]
##     year month   day first_flight
##    <int> <int> <int> <lgl>       
##  1  2013     1     1 TRUE        
##  2  2013     1     1 FALSE       
##  3  2013     1     1 FALSE       
##  4  2013     1     1 FALSE       
##  5  2013     1     1 FALSE       
##  6  2013     1     1 FALSE       
##  7  2013     1     1 FALSE       
##  8  2013     1     1 FALSE       
##  9  2013     1     1 FALSE       
## 10  2013     1     1 FALSE       
## # ... with 336,766 more rows
  1. Find out in which district the most incidents were reported.
crime <- readr::read_csv(here::here('SoSe_2022/datasets/BostonCrime.csv'))

crime %>% 
  group_by(DISTRICT) %>%
  summarise(N = n()) %>% 
  arrange(desc(N))
## # A tibble: 13 x 2
##    DISTRICT     N
##    <chr>    <int>
##  1 B2       15658
##  2 C11      12957
##  3 D4       12454
##  4 B3       11560
##  5 A1       10976
##  6 C6        7516
##  7 D14       6159
##  8 E18       5517
##  9 E13       5480
## 10 E5        4095
## 11 A7        3800
## 12 A15       2034
## 13 <NA>       682
  1. Create an ordered list of the top ten districts according to the number of drug violations. Start with the district with most drug violations.
crime %>% 
  filter(OFFENSE_CODE_GROUP == "Drug Violation") %>% 
  group_by(DISTRICT) %>% 
  summarise(N = n()) %>% 
  arrange(desc(N)) 
## # A tibble: 13 x 2
##    DISTRICT     N
##    <chr>    <int>
##  1 B2         849
##  2 B3         695
##  3 C6         599
##  4 A1         518
##  5 C11        468
##  6 D4         455
##  7 E13        280
##  8 D14        222
##  9 A7         203
## 10 E18        174
## 11 E5         139
## 12 A15        118
## 13 <NA>        57
  1. What was the most dangerous hour of the week according to the number of homicides (OFFENSE_CODE_GROUP == ‘Homicide’)?
crime %>% 
  filter(OFFENSE_CODE_GROUP == "Homicide") %>% 
  group_by(HOUR, DAY_OF_WEEK) %>%
  summarise(N = n()) %>% 
  arrange(desc(N))
## # A tibble: 46 x 3
## # Groups:   HOUR [20]
##     HOUR DAY_OF_WEEK     N
##    <dbl> <chr>       <int>
##  1     0 Thursday        5
##  2     9 Tuesday         3
##  3    21 Friday          3
##  4    21 Monday          3
##  5    21 Saturday        3
##  6    22 Tuesday         3
##  7     0 Saturday        2
##  8     4 Sunday          2
##  9    13 Saturday        2
## 10    14 Thursday        2
## # ... with 36 more rows
  1. Calculate for each district the proportion of motor vehicle accidents to all reported incidents (OFFENSE_CODE_GROUP == ‘Motor Vehicle Accident Response’).
 crime %>% 
  group_by(DISTRICT) %>%  
  transmute(prop_accidents = sum(OFFENSE_CODE_GROUP == "Motor Vehicle Accident Response") / n()) 
## # A tibble: 98,888 x 2
## # Groups:   DISTRICT [13]
##    DISTRICT prop_accidents
##    <chr>             <dbl>
##  1 B3               0.102 
##  2 E18              0.134 
##  3 E5               0.139 
##  4 A1               0.0867
##  5 D4               0.0926
##  6 E5               0.139 
##  7 B3               0.102 
##  8 B2               0.125 
##  9 D4               0.0926
## 10 C11              0.127 
## # ... with 98,878 more rows
  1. Sort the data set chronologically by year and month. Then take the first 10.000 observations and compute the proportion of incidents where a shooting was involved for each district. Now take a random sample of size 10.000. Is there a noteworthy difference? Hint: There is a variable called SHOOTING.
crime  %>%
  arrange(desc(YEAR), desc(MONTH)) %>%
  slice(1:10000) %>%
  mutate(SHOOTING = ifelse(is.na(SHOOTING), 0, SHOOTING)) %>%
  summarise(porpotion = mean(SHOOTING == "Y"))
## # A tibble: 1 x 1
##   porpotion
##       <dbl>
## 1    0.0027
set.seed(1245)

crime  %>%
  slice_sample(n = 10000) %>%
  mutate(SHOOTING = ifelse(is.na(SHOOTING), 0, SHOOTING)) %>%
  summarise(porpotion = mean(SHOOTING == "Y"))
## # A tibble: 1 x 1
##   porpotion
##       <dbl>
## 1    0.0043
  1. Compute the TB rate for table2, table4a and table4b.
table2[table2$type == "cases", "count"] / table2[table2$type == "population", "count"]
##          count
## 1 0.0000372741
## 2 0.0001294466
## 3 0.0002193930
## 4 0.0004612363
## 5 0.0001667495
## 6 0.0001669488
table4a[ ,-1] / table4b[ ,-1]
##           1999         2000
## 1 0.0000372741 0.0001294466
## 2 0.0002193930 0.0004612363
## 3 0.0001667495 0.0001669488
  1. Recreate the plot given by the code below using table2 instead of table1.
ggplot(table1, aes(year, cases)) + 
  geom_line(aes(group = country), colour = "grey50") + 
  geom_point(aes(colour = country))

table2 %>% 
  filter(type == "cases") %>%
  ggplot( aes(x = year, y = count)) +
    geom_line(aes( group = country)) +
    geom_point(aes(color = country))